import json
import pandas as pd
import plotly.express as px
import os
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import plotly.graph_objects as go
import numpy as np
# Load the events sample and build one DataFrame row per event.
with open('dataset/sample_20200501.json', 'r') as f:
    data = json.load(f)
print(len(data["events"]))
events = data["events"]
df = pd.DataFrame(events)
df
## selecting some columns
We are going to separate the elements stored in each tag list into new rows.
# Peek at the raw tag lists, then explode so each tag gets its own row.
df["tags"][0:5]
df_tags = df.explode('tags')
df_tags
# Count how often each tag appears across all events, most frequent first.
g_tags = (
    df_tags.groupby(['tags'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
    .sort_values(by=['number_of_times'], ascending=False)
)
g_tags
fig = px.line(g_tags, x="tags", y="number_of_times", title='Number of times that each tag appears')
fig.show()
Given a descriptions cell containing a list of descriptions, we will create a new row per element in that list.
# Inspect the description lists, then flatten them: one row per description.
df["descriptions"][0:5]
exploded = df.explode('descriptions')
# Expand each description dict into its own columns.
expanded = pd.concat(
    [exploded.drop(['descriptions'], axis=1),
     exploded['descriptions'].apply(pd.Series)],
    axis=1,
)
df_desc = expanded[["event_id", "description"]]
df_desc
# Removing the rows whose description is empty.
df_desc1 = df_desc.dropna(subset=['description']).reset_index()
df_desc1[0:5]
# Total number of rows with descriptions.
df_desc1.shape[0]
# Selecting the description column.
documents = df_desc1["description"].values
d = documents[:]
# Using the all-MiniLM-L6-v2 sentence transformer to embed every description.
model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = model.encode(d, batch_size=8, show_progress_bar=True)
np.shape(text_embeddings)
### A small example of how to get an embedding vector from a description
first_description = df_desc1["description"].iloc[0]
first_description
# Embed the single description with the same model used for the corpus.
first_description_embedding = model.encode(first_description, batch_size=8, show_progress_bar=True)
# Similarity of the first description against every description.
similarity_def = cosine_similarity(
    [first_description_embedding],
    text_embeddings)
# Full pairwise similarity matrix between all descriptions.
similarities = cosine_similarity(text_embeddings)
print('pairwise dense output:\n {}\n'.format(similarities))
# Per row: column indices sorted by ascending similarity.
similarities_sorted = similarities.argsort()
similarities_sorted
# For every document, record its closest other document. argsort puts the
# best match last; position -1 is normally the document itself (similarity 1),
# so position -2 is the nearest distinct document.
id_1, id_2, score = [], [], []
for index, order in enumerate(similarities_sorted):
    p = len(order)
    nearest = order[-2]
    id_1.append(index)
    id_2.append(nearest)
    score.append(similarities[index][nearest])
index_df = pd.DataFrame({'id_1': id_1,
                         'id_2': id_2,
                         'score': score})
print(p)
index_df
## Let's take document 3 and list its ten nearest neighbours.
doc_index = 3
documents[3]
results = {}
# Walk positions -2..-11 of the sorted row (skipping -1, the document itself).
for position in range(-2, -12, -1):
    neighbour = similarities_sorted[doc_index][position]
    results[neighbour] = [similarities[doc_index][neighbour]]
results
Let's find the topics of our descriptions. We are going to use the text_embeddings calculated in the previous phase.
len(documents)
# Fit the topic model and assign a topic to every document in one pass.
# The original called .fit(...) and then .transform(...) on the same
# documents, which makes BERTopic compute the clustering twice;
# fit_transform returns the same (topics, probs) in a single pass.
topic_model = BERTopic(min_topic_size=20)
topics, probs = topic_model.fit_transform(documents, text_embeddings)
topic_model.visualize_topics()
#### Visualizing the first 5 keywords of our first 5 topics
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
We should always ignore the first topic (-1), which collects the outlier documents.
# Let's see the frequency of the first 10 topics.
topic_model.get_topic_freq()[0:10]
print("Number of topics found %s" % len(topic_model.get_topic_freq()))
# Look up the topic assigned to document 3 and its keywords.
document_3_topic = topics[3]
print("The topic of the document 3 is %s " % document_3_topic)
topic_model.get_topic(0)
df_desc1[3:4]
topic_model.get_topic(document_3_topic)
Let's start exploding the schedules column.
df["schedules"]
# Rename the event-level columns up front so they keep distinct names after
# the later dict expansions and merges. The original did `df_schedules = df`
# and then renamed with inplace=True, which silently mutated df through the
# alias; renaming df directly (and in one call) makes that side effect
# explicit without changing the result.
df.rename(
    columns={'tags': 'event_tags', 'name': 'event_name', 'links': 'event_links'},
    inplace=True,
)
# One row per schedule entry.
df_schedules = df.explode('schedules')
# Expand each schedule dict (start_ts, end_ts, ...) into columns.
df_s = pd.concat(
    [df_schedules.drop(['schedules'], axis=1),
     df_schedules['schedules'].apply(pd.Series)],
    axis=1,
)
df_s.iloc[0]
# Frequency of schedule start dates.
df_start = df_s.groupby([pd.to_datetime(df_s['start_ts'])]).size().reset_index()
df_start = df_start.rename(columns={0: "number_of_times"})
df_start = df_start.sort_values(by=['number_of_times'], ascending=False)
df_start.reset_index()
fig = px.histogram(df_start, x='start_ts', y="number_of_times", title="Frequency of Starts Dates Schedules")
fig.show()
# Frequency of schedule end dates.
df_end = df_s.groupby([pd.to_datetime(df_s['end_ts'])]).size().reset_index()
df_end = df_end.rename(columns={0: "number_of_times"})
df_end = df_end.sort_values(by=['number_of_times'], ascending=False)
df_end.reset_index()
fig = px.histogram(df_end, x='end_ts', y="number_of_times", title="Frequency of End Dates Schedules")
fig.show()
Let's start exploding the performances column. We cannot explode the performances column if we have not exploded the schedules column first. For that reason, we use the df_s dataframe, which already has the schedules column exploded.
df_s
# Keep only the columns we need, then give each performance its own row.
perf_cols = df_s[["event_id", "event_name", "performances", "event_tags", "start_ts", "end_ts", "place_id"]]
df_p = perf_cols.explode("performances")
df_p
# Expand each performance dict into columns.
df_p = pd.concat(
    [df_p.drop(['performances'], axis=1),
     df_p['performances'].apply(pd.Series)],
    axis=1,
)
df_p[0:2]
Now we have to explode the tickets column. We are going to remove the rows whose ticket information is empty.
# Drop performances that carry no ticket information.
df_p = df_p[df_p['tickets'].notna()]
Since we do not need all the columns, we select a few of them.
# Keep the ticket-relevant columns and give each ticket its own row.
df_t = df_p[["event_id", "event_name", "descriptions", "event_tags", "tickets", "place_id", "start_ts", "end_ts"]]
df_t.head()
df_t1 = df_t.explode("tickets")
Now we are going to transform the max and min prices of tickets into numeric values.
# Expand each ticket dict into columns.
df_tickets = pd.concat([df_t1.drop(['tickets'], axis=1), df_t1['tickets'].apply(pd.Series)], axis=1)
# Convert prices to numbers. errors='coerce' turns unparsable values into
# NaN instead of raising (the default errors='raise' would abort the whole
# notebook on one bad string); fillna(0) then treats missing/bad prices as
# free tickets, exactly as the original pipeline intended. Values that
# already parse are unaffected.
df_tickets['min_price'] = pd.to_numeric(df_tickets['min_price'], errors='coerce').fillna(0)
df_tickets['max_price'] = pd.to_numeric(df_tickets['max_price'], errors='coerce').fillna(0)
df_tickets[0:5]
# We work with max_price only: count tickets per price point.
g_maxp = df_tickets.groupby(['max_price']).size().reset_index()
g_maxp = g_maxp.rename(columns={0: "number_of_times"})
# groupby sorts by price, so row 0 is max_price == 0: the free tickets.
free_tickets = g_maxp[0:1]
## Removing FREE TICKETS before plotting the price curve.
g_maxp = g_maxp.drop([0])
g_maxp[:]
fig = px.line(g_maxp, x="max_price", y="number_of_times", title='Frequency of price tickets')
fig.show()
print("The number of free tickets is: %s" % free_tickets["number_of_times"].values[0])
# Frequency of each ticket type.
tickets_type = df_tickets.groupby(['type']).size().reset_index()
tickets_type = tickets_type.rename(columns={0: "number_of_times"}).sort_values(by=['number_of_times'], ascending=False)
tickets_type
px.histogram(tickets_type, x="type", y="number_of_times", histfunc="sum", color="type", title='Frequency of type tickets')
df_tickets["place_id"]
# Load the places sample and join tickets to their venue.
# Two fixes vs. the original: (1) the path string was first assigned to
# `data` and immediately overwritten by the parsed JSON — the dead
# assignment is dropped; (2) merge was given on=['place_id','place_id'],
# i.e. the same key twice — a single 'place_id' key is what was intended.
with open('dataset/sample_20180501.json', 'r') as f:
    data = json.load(f)
print(len(data["places"]))
places = data["places"]
df_places = pd.DataFrame(places)
df_place = df_tickets.merge(df_places, on='place_id')
df_place.shape[0]
# Keep rows with a known town and count performances per town.
df_town = df_place.dropna(subset=['town'])
town = (
    df_town.groupby(['town'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
    # Index label 0 is the first group — presumably the empty-string town
    # (cf. the explicit != "" filters used below); confirm against the data.
    .drop([0])
    .sort_values(by=['number_of_times'], ascending=False)
)
town
px.scatter(town, x="town", y='number_of_times', color='number_of_times', size="number_of_times", size_max=60, title="Frequency of Performances per Town")
# Count ticket types per town, skipping the empty-string town.
town_type = (
    df_town.groupby(['town', 'type'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
)
town_type = town_type[town_type["town"] != ""]
town_type = town_type.sort_values(by=['number_of_times'], ascending=False)
town_type
fig = px.scatter(town_type, x='town', y='type', color='number_of_times', title="Frequency of type tickets per town")
fig.show()
px.scatter(town_type, x="town", y='type', color='number_of_times', size="number_of_times", size_max=60, title="Frequency of performances type tickets per town")
# Price frequency per town, excluding the empty-string town.
prices = df_town[["town", "max_price"]]
prices = prices[prices["town"] != ""]
town_price = (
    prices.groupby(['town', 'max_price'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
    .sort_values(by=['number_of_times'], ascending=False)
)
town_price
# Free tickets (max_price == 0) per town.
free_town_price = town_price[town_price["max_price"] == 0.0]
free_town_price
fig = px.bar(free_town_price, x='town', y='number_of_times', color='number_of_times', barmode='group', title="Frequency of Free Tickets per Town")
fig.show()
# From here on, keep only the paid tickets.
town_price = town_price[town_price["max_price"] != 0.0]
town_price
fig = px.bar(town_price, x='town', y='max_price', color='number_of_times', barmode='group', title="Frequency of Price Tickets per Town")
fig.show()
# Total paid-ticket price per town, highest first.
town_price.groupby(["town"]).sum().sort_values(by=['max_price'], ascending=False)
# Restrict the price and type breakdowns to the Scottish cities of interest.
scottish_cities = ["Edinburgh", "Glasgow", "Perth", "Inverness", "Dundee", "St Andrews", "Aberdeen"]
scot_towns_price = town_price[town_price['town'].isin(scottish_cities)]
scot_towns_price[0:10]
fig = px.bar(scot_towns_price, x='town', y='max_price', color='number_of_times', barmode='group', title="Frequency of Price Tickets per Scottish City")
fig.show()
scot_towns_price.groupby(["town"]).sum().sort_values(by=['max_price'], ascending=False)
scot_towns_type = town_type[town_type['town'].isin(scottish_cities)]
scot_towns_type[0:10]
fig = px.bar(scot_towns_type, x='town', y='number_of_times', color='type', barmode='group', title="Frequency of Type Tickets per Scottish City")
fig.show()
scot_towns_type.groupby(["town"]).sum()
df_place.loc[0]
# Rows with a known town, restricted to the Scottish cities of interest.
df_place2 = df_place.dropna(subset=['town'])
df_place2
scottish_cities = ["Edinburgh", "Glasgow", "Perth", "Inverness", "Dundee", "St Andrews", "Aberdeen"]
df_scott = df_place2.loc[
    df_place2['town'].isin(scottish_cities),
    ["event_id", "event_name", "event_tags", "town", "start_ts", "end_ts"],
]
df_scott[0:3]
Note: An event can have several schedules, and a schedule has a starting and an end date. Therefore, an event can have several starting and end dates.
# Start and end dates per event across the Scottish cities.
fig = px.scatter(df_scott, x='start_ts', y="event_name", title="Frequency of starting date per event in Scottish cities")
fig.show()
fig = px.scatter(df_scott, x='end_ts', y="event_name", title="Frequency of ending date per event in Scottish cities")
fig.show()
# Number of schedules per (event, town) pair, most frequent first.
scott_schedule = (
    df_scott.groupby(['event_name', 'town'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
    .sort_values(by=['number_of_times'], ascending=False)
)
scott_schedule
t = scott_schedule.groupby(["event_name"]).sum().sort_values(by=['number_of_times'], ascending=False)
t
fig = px.bar(t, title="Frequency of Schedules per event")
fig.show()
# Tag frequency per Scottish city: one row per (event row, tag) pair.
flat = df_scott.reset_index(drop=True)
tags_town = flat[["event_tags", "town"]].explode("event_tags")
tags_town
scott_tag = (
    tags_town.groupby(['town', 'event_tags'])
    .size()
    .reset_index()
    .rename(columns={0: "number_of_times"})
    .sort_values(by=['number_of_times'], ascending=False)
)
scott_tag
fig = px.histogram(scott_tag, x="town", y="number_of_times", histfunc="sum", color="event_tags", title='Frequency of tags in Scottish Cities')
fig.update_layout(legend_traceorder="reversed")
fig.show()
t = scott_tag.groupby(["event_tags"]).sum().sort_values(by=['number_of_times'], ascending=False)
t
# Zoom in on Edinburgh only.
edi_scott_tag = scott_tag[scott_tag['town'].isin(["Edinburgh"])]
edi_scott_tag
edi_scott_tag.groupby(["event_tags"]).sum().sort_values(by=['number_of_times'], ascending=False)
fig = px.bar(edi_scott_tag, x='town', y='number_of_times', color='event_tags', barmode='group', title="Frequency of schedules tags for Edinburgh")
fig.show()
def _date_town_counts(frame, column):
    # Count schedules per (parsed date, town) pair, most frequent first.
    counts = frame.groupby([pd.to_datetime(frame[column]), "town"]).size().reset_index()
    counts = counts.rename(columns={0: "number_of_times"})
    return counts.sort_values(by=['number_of_times'], ascending=False)

# Start-date frequencies, overall and for Edinburgh.
scott_start = _date_town_counts(df_scott, 'start_ts')
scott_start.reset_index()
ed_scott_start = scott_start[scott_start['town'].isin(["Edinburgh"])].reset_index()
ed_scott_start.groupby(["start_ts"]).sum().sort_values(by=['number_of_times'], ascending=False)
# End-date frequencies, overall and for Edinburgh.
scott_end = _date_town_counts(df_scott, 'end_ts')
scott_end.reset_index()
ed_scott_end = scott_end[scott_end['town'].isin(["Edinburgh"])].reset_index()
ed_scott_end.groupby(["end_ts"]).sum().sort_values(by=['number_of_times'], ascending=False)
fig = px.histogram(ed_scott_start, x='start_ts', y="number_of_times", title="Histogram of Schedules Starting Dates for Edinburgh")
fig.show()
fig = px.histogram(scott_start, x='start_ts', y="number_of_times", title="Histogram of Schedules Starting Dates for Scottish Cities")
fig.show()
fig = px.histogram(scott_end, x='end_ts', y="number_of_times", title="Histogram of Schedules Ending Dates for Scottish Cities")
fig.show()
# Monthly-binned histogram of end dates with a daily scatter overlay.
fig = px.histogram(scott_end, x="end_ts", y="number_of_times", histfunc="sum", title="Histogram on Date Axes")
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.1)
fig.add_trace(go.Scatter(mode="markers", x=scott_end["end_ts"], y=scott_end["number_of_times"], name="daily"))
fig.show()
# Tag frequency over time: explode tags, then count per (date, tag) pair.
timed = df_scott.reset_index(drop=True)
tag_town_time = timed[["event_tags", "town", "start_ts", "end_ts"]].explode("event_tags")
tag_town_time

def _date_tag_counts(frame, column):
    # Count rows per (parsed date, tag) pair, most frequent first.
    counts = frame.groupby([pd.to_datetime(frame[column]), "event_tags"]).size().reset_index()
    counts = counts.rename(columns={0: "number_of_times"})
    return counts.sort_values(by=['number_of_times'], ascending=False)

scott_tag_end = _date_tag_counts(tag_town_time, 'end_ts')
scott_tag_start = _date_tag_counts(tag_town_time, 'start_ts')
scott_tag_start
fig = px.scatter(scott_tag_start, x='start_ts', y='number_of_times', title="Frequency of schedules Starting Date in Scottish City.")
fig.show()
fig = px.scatter(scott_tag_end, x='end_ts', y='number_of_times', title="Frequency of schedules Ending Date in Scottish City.")
fig.show()
fig = px.scatter(scott_tag_start, x='start_ts', y='event_tags', title="Scheduled Tags and Starting Dates in Scottish City.")
fig.show()
fig = px.scatter(scott_tag_end, x='end_ts', y='event_tags', title="Scheduled Tags and Ending Dates in Scottish City.")
fig.show()